{
 "cells": [
  {
   "cell_type": "markdown",
   "source": [
    "## Automated Snow Leopard Detection with Synapse Machine Learning\n",
    "\n",
    "<img src=\"https://mmlspark.blob.core.windows.net/graphics/SnowLeopardAD/SLTrust.PNG\" width=\"900\" style=\"float: left;\"/>"
   ],
   "metadata": {}
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "source": [
    "from synapse.ml.core.platform import *\n",
    "\n",
    "# Look up the Bing Image Search subscription key with find_secret.\n",
    "bing_search_key = find_secret(\n",
    "    secret_name=\"bing-search-key\",\n",
    "    keyvault=\"mmlspark-build-keys\",\n",
    ")\n",
    "\n",
    "# WARNING: this notebook requires a lot of memory.\n",
    "# If you hit a heap space error, reduce the number of images requested from Bing,\n",
    "# or write the images out to parquet first."
   ],
   "outputs": [],
   "metadata": {
    "collapsed": true
   }
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "source": [
    "from synapse.ml.services import *\n",
    "from synapse.ml.core.spark import FluentAPI\n",
    "from pyspark.sql.functions import lit\n",
    "\n",
    "\n",
    "def bingPhotoSearch(name, queries, pages, count=10, max_images=400):\n",
    "    \"\"\"Build a DataFrame of Bing image URLs for `queries`, labelled with `name`.\n",
    "\n",
    "    Args:\n",
    "        name: Value written to the \"labels\" column of every returned row.\n",
    "        queries: Iterable of search strings sent to Bing Image Search.\n",
    "        pages: Number of result pages requested for each query.\n",
    "        count: Number of images requested per page. It is also used as the\n",
    "            stride between page offsets so consecutive pages do not overlap.\n",
    "        max_images: Upper bound on the number of rows returned.\n",
    "\n",
    "    Returns:\n",
    "        The Spark DataFrame produced by the Bing URL transformer, with an\n",
    "        added \"labels\" column, limited to `max_images` rows.\n",
    "    \"\"\"\n",
    "    # One (query, offset) pair per page and query; the offset stride equals the page size.\n",
    "    offsets = [page * count for page in range(pages)]\n",
    "    parameters = [(query, offset) for offset in offsets for query in queries]\n",
    "\n",
    "    image_search = (\n",
    "        BingImageSearch()  # Apply Bing Image Search\n",
    "        .setSubscriptionKey(bing_search_key)  # Set the API Key\n",
    "        .setOffsetCol(\"offsets\")  # Specify a column containing the offsets\n",
    "        .setQueryCol(\"queries\")  # Specify a column containing the query words\n",
    "        .setCount(count)  # Specify the number of images to return per offset\n",
    "        .setImageType(\"photo\")  # Specify a filter to ensure we get photos\n",
    "        .setOutputCol(\"images\")\n",
    "    )\n",
    "\n",
    "    return (\n",
    "        spark.createDataFrame(parameters, (\"queries\", \"offsets\"))\n",
    "        .mlTransform(image_search)\n",
    "        .mlTransform(BingImageSearch.getUrlTransformer(\"images\", \"urls\"))\n",
    "        .withColumn(\"labels\", lit(name))\n",
    "        .limit(max_images)\n",
    "    )"
   ],
   "outputs": [],
   "metadata": {
    "collapsed": true
   }
  },
  {
   "cell_type": "markdown",
   "source": [
    "<img src=\"https://mmlspark.blob.core.windows.net/graphics/SparkSummit2/cog_services.png\" width=\"900\" style=\"float: left;\"/>"
   ],
   "metadata": {}
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "source": [
    "def displayDF(df, n=5, image_cols=set([\"urls\"])):\n",
    "    \"\"\"Render the first `n` rows of `df` as an HTML table.\n",
    "\n",
    "    Values in columns listed in `image_cols` are used as the source of an\n",
    "    image tag; all other values are rendered as text.\n",
    "\n",
    "    Args:\n",
    "        df: DataFrame to preview (uses `df.take(n)` and `df.columns`).\n",
    "        n: Number of rows to show.\n",
    "        image_cols: Names of the columns whose values are image URLs.\n",
    "\n",
    "    Rendering is best-effort: if it fails, a message is printed instead of\n",
    "    raising, so the rest of the notebook can continue.\n",
    "    \"\"\"\n",
    "    import html\n",
    "\n",
    "    rows = df.take(n)\n",
    "    cols = df.columns\n",
    "    header = \"\".join([\"<th>\" + html.escape(c) + \"</th>\" for c in cols])\n",
    "\n",
    "    style = \"\"\"\n",
    "<!DOCTYPE html>\n",
    "<html>\n",
    "<head>\n",
    "<style>\n",
    "table {\n",
    "    font-family: arial, sans-serif;\n",
    "    border-collapse: collapse;\n",
    "    width: 300;\n",
    "}\n",
    "\n",
    "td, th {\n",
    "    border: 1px solid #dddddd;\n",
    "    text-align: left;\n",
    "    padding: 8px;\n",
    "}\n",
    "\n",
    "tr:nth-child(even) {\n",
    "    background-color: #dddddd;\n",
    "}\n",
    "</style>\n",
    "</head>\"\"\"\n",
    "\n",
    "    table = []\n",
    "    for row in rows:\n",
    "        table.append(\"<tr>\")\n",
    "        for col in cols:\n",
    "            # Escape every value: the text comes from external search results\n",
    "            # and must not be able to inject markup into the page.\n",
    "            value = html.escape(str(row[col]))\n",
    "            if col in image_cols:\n",
    "                rep = '<img src=\"{}\" width=\"100\">'.format(value)\n",
    "            else:\n",
    "                rep = value\n",
    "            table.append(\"<td>{}</td>\".format(rep))\n",
    "        table.append(\"</tr>\")\n",
    "    tableHTML = \"\".join(table)\n",
    "\n",
    "    body = \"\"\"\n",
    "<body>\n",
    "<table>\n",
    "  <tr>\n",
    "    {} \n",
    "  </tr>\n",
    "  {}\n",
    "</table>\n",
    "</body>\n",
    "</html>\n",
    "  \"\"\".format(\n",
    "        header, tableHTML\n",
    "    )\n",
    "    try:\n",
    "        if running_on_databricks():\n",
    "            displayHTML(style + body)\n",
    "        else:\n",
    "            from IPython.display import HTML, display as ipython_display\n",
    "\n",
    "            # Constructing HTML(...) inside a function shows nothing by itself;\n",
    "            # it has to be passed to display() explicitly.\n",
    "            ipython_display(HTML(style + body))\n",
    "    except Exception as error:\n",
    "        # Keep the preview best-effort, but report why nothing was rendered.\n",
    "        print(\"displayDF: could not render the HTML preview: {}\".format(error))"
   ],
   "outputs": [],
   "metadata": {
    "collapsed": true
   }
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "source": [
    "# Search Bing for snow leopard photos and preview the resulting URLs.\n",
    "snowLeopardQueries = [\"snow leopard\"]\n",
    "snowLeopardUrls = bingPhotoSearch(\n",
    "    name=\"snow leopard\", queries=snowLeopardQueries, pages=100\n",
    ")\n",
    "displayDF(df=snowLeopardUrls)"
   ],
   "outputs": [],
   "metadata": {
    "collapsed": true
   }
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "source": [
    "# Read the random_words parquet data set, cache it, and print a sample.\n",
    "randomWords = spark.read.parquet(\n",
    "    \"wasbs://publicwasb@mmlspark.blob.core.windows.net/random_words.parquet\"\n",
    ")\n",
    "randomWords = randomWords.cache()\n",
    "randomWords.show()"
   ],
   "outputs": [],
   "metadata": {}
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "source": [
    "# Search Bing for every random word to collect image URLs for the \"other\" class.\n",
    "randomLinks = (\n",
    "    randomWords.mlTransform(\n",
    "        BingImageSearch()\n",
    "        .setSubscriptionKey(bing_search_key)\n",
    "        .setCount(10)\n",
    "        .setQueryCol(\"words\")\n",
    "        .setOutputCol(\"images\")\n",
    "    )\n",
    "    .mlTransform(BingImageSearch.getUrlTransformer(\"images\", \"urls\"))\n",
    "    # Use the same \"labels\" column name as bingPhotoSearch so that both URL\n",
    "    # sets share one schema instead of relying on union() matching by position.\n",
    "    .withColumn(\"labels\", lit(\"other\"))\n",
    "    .limit(400)\n",
    ")\n",
    "\n",
    "displayDF(randomLinks)"
   ],
   "outputs": [],
   "metadata": {
    "collapsed": true
   }
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "source": [
    "# Combine both URL sets, remove duplicate rows, download the images,\n",
    "# and drop every row that contains a null.\n",
    "images = (\n",
    "    snowLeopardUrls.union(randomLinks)\n",
    "    .distinct()\n",
    "    .repartition(numPartitions=100)\n",
    "    .mlTransform(\n",
    "        BingImageSearch.downloadFromUrls(\"urls\", \"image\", concurrency=5, timeout=5000)\n",
    "    )\n",
    "    .dropna(how=\"any\")\n",
    ")\n",
    "\n",
    "# 70/30 train/test split with a fixed seed.\n",
    "train, test = images.randomSplit(weights=[0.7, 0.3], seed=1)"
   ],
   "outputs": [],
   "metadata": {}
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "source": [
    "from pyspark.ml import Pipeline\n",
    "from pyspark.ml.feature import StringIndexer\n",
    "from pyspark.ml.classification import LogisticRegression\n",
    "from pyspark.sql.functions import udf\n",
    "from synapse.ml.onnx import ImageFeaturizer\n",
    "from synapse.ml.stages import UDFTransformer\n",
    "from pyspark.sql.types import *\n",
    "\n",
    "\n",
    "def getIndex(row):\n",
    "    \"\"\"Return the element at position 1 of `row`, converted to a float.\"\"\"\n",
    "    second_entry = row[1]\n",
    "    return float(second_entry)\n",
    "\n",
    "\n",
    "# Pipeline: index the labels, featurize the images with ResNet50, fit a\n",
    "# logistic regression, and expose the probability at index 1 as \"leopard_prob\".\n",
    "model = Pipeline(\n",
    "    stages=[\n",
    "        # Order labels alphabetically so that \"other\" -> 0 and \"snow leopard\" -> 1.\n",
    "        # The default frequency ordering would make the meaning of index 1\n",
    "        # (read by getIndex below) depend on which class has more rows.\n",
    "        StringIndexer(\n",
    "            inputCol=\"labels\", outputCol=\"index\", stringOrderType=\"alphabetAsc\"\n",
    "        ),\n",
    "        ImageFeaturizer(\n",
    "            inputCol=\"image\",\n",
    "            outputCol=\"features\",\n",
    "            autoConvertToColor=True,\n",
    "            ignoreDecodingErrors=True,\n",
    "        ).setModel(\"ResNet50\"),\n",
    "        LogisticRegression(maxIter=5, labelCol=\"index\", regParam=10.0),\n",
    "        UDFTransformer()\n",
    "        .setUDF(udf(getIndex, DoubleType()))\n",
    "        .setInputCol(\"probability\")\n",
    "        .setOutputCol(\"leopard_prob\"),\n",
    "    ]\n",
    ")\n",
    "\n",
    "fitModel = model.fit(train)"
   ],
   "outputs": [],
   "metadata": {}
  },
  {
   "cell_type": "markdown",
   "source": [
    "<img src=\"https://mmlspark.blob.core.windows.net/graphics/SnowLeopardAD/SLPipeline.PNG\" width=\"900\" style=\"float: left;\"/>"
   ],
   "metadata": {}
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "source": [
    "def plotConfusionMatrix(df, label, prediction, classLabels):\n",
    "    \"\"\"Draw a confusion matrix on a new 4.5 x 4.5 inch figure and display it.\n",
    "\n",
    "    All four arguments are forwarded unchanged to\n",
    "    `synapse.ml.plot.confusionMatrix`.\n",
    "    \"\"\"\n",
    "    import matplotlib.pyplot as plt\n",
    "    from synapse.ml.plot import confusionMatrix\n",
    "\n",
    "    figure = plt.figure(figsize=(4.5, 4.5))\n",
    "    confusionMatrix(df, label, prediction, classLabels)\n",
    "    display(figure)\n",
    "\n",
    "\n",
    "# The confusion matrix for the test set is only plotted when not running on Synapse.\n",
    "if not running_on_synapse():\n",
    "    plotConfusionMatrix(\n",
    "        df=fitModel.transform(test),\n",
    "        label=\"index\",\n",
    "        prediction=\"prediction\",\n",
    "        classLabels=fitModel.stages[0].labels,\n",
    "    )"
   ],
   "outputs": [],
   "metadata": {
    "collapsed": true
   }
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "source": [
    "import urllib.request\n",
    "from synapse.ml.explainers import ImageLIME\n",
    "\n",
    "# Download a single test image and wrap its bytes in a one-row DataFrame.\n",
    "test_image_url = (\n",
    "    \"https://mmlspark.blob.core.windows.net/graphics/SnowLeopardAD/snow_leopard1.jpg\"\n",
    ")\n",
    "# Without a timeout a stalled connection would block this cell indefinitely.\n",
    "with urllib.request.urlopen(test_image_url, timeout=60) as url:\n",
    "    barr = url.read()\n",
    "test_subsample = spark.createDataFrame([(bytearray(barr),)], [\"image\"])\n",
    "\n",
    "# Configure ImageLIME to explain the fitted pipeline's \"leopard_prob\" output.\n",
    "lime = (\n",
    "    ImageLIME()\n",
    "    .setModel(fitModel)\n",
    "    .setTargetCol(\"leopard_prob\")\n",
    "    .setOutputCol(\"weights\")\n",
    "    .setInputCol(\"image\")\n",
    "    .setCellSize(100.0)\n",
    "    .setModifier(50.0)\n",
    "    .setNumSamples(300)\n",
    ")\n",
    "\n",
    "result = lime.transform(test_subsample)"
   ],
   "outputs": [],
   "metadata": {
    "collapsed": true
   }
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "source": [
    "import matplotlib.pyplot as plt\n",
    "import PIL, io, numpy as np\n",
    "\n",
    "\n",
    "def plot_superpixels(row):\n",
    "    \"\"\"Highlight the highest-weighted superpixels on the image stored in `row`.\n",
    "\n",
    "    Superpixels whose weight exceeds the 90th percentile of all weights get\n",
    "    their green channel set to 255 and their alpha channel set to 200.\n",
    "\n",
    "    Args:\n",
    "        row: Row providing \"image\" (encoded image bytes), \"superpixels\" (with a\n",
    "            \"clusters\" entry holding one list of (x, y) pixels per superpixel)\n",
    "            and \"weights\" (the first entry holds one weight per superpixel).\n",
    "    \"\"\"\n",
    "    # `import PIL` alone does not load the PIL.Image submodule, so import it\n",
    "    # explicitly instead of relying on another library having loaded it.\n",
    "    from PIL import Image\n",
    "\n",
    "    image_bytes = row[\"image\"]\n",
    "    superpixels = row[\"superpixels\"][\"clusters\"]\n",
    "    weights = list(row[\"weights\"][0])\n",
    "    # Only superpixels above the 90th percentile of the weights are highlighted.\n",
    "    threshold = np.percentile(weights, 90)\n",
    "    img = Image.open(io.BytesIO(image_bytes)).convert(\"RGBA\")\n",
    "    # np.asarray may return a read-only view of the image, so work on a copy.\n",
    "    image_array = np.asarray(img).copy()\n",
    "    for sp, w in zip(superpixels, weights):\n",
    "        if w > threshold:\n",
    "            for x, y in sp:\n",
    "                # The array is indexed [row, column, channel], i.e. [y, x, channel].\n",
    "                image_array[y, x, 1] = 255\n",
    "                image_array[y, x, 3] = 200\n",
    "    plt.clf()\n",
    "    plt.imshow(image_array)\n",
    "    display()\n",
    "\n",
    "\n",
    "# Plot the first row of the LIME-transformed DataFrame (skipped on Synapse).\n",
    "if not running_on_synapse():\n",
    "    plot_superpixels(row=result.take(1)[0])"
   ],
   "outputs": [],
   "metadata": {
    "collapsed": true
   }
  },
  {
   "cell_type": "markdown",
   "source": [
    "### Your results will look like:\n",
    "<img src=\"https://mmlspark.blob.core.windows.net/graphics/SnowLeopardAD/lime_results.png\" width=\"900\" style=\"float: left;\"/>"
   ],
   "metadata": {}
  }
 ],
 "metadata": {
  "anaconda-cloud": {},
  "kernelspec": {
   "display_name": "Python [default]",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.6.3"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}
